# Load in the whole Alzheimers data file
data <- read.csv('~/repos/portfolio/demonstrative/R/datasets/alzheimers/alzheimers.csv')
# Remove this field ... I forgot to do that when creating the dataset
data <- data %>% select(-male)
# Normalize gender labels
stopifnot(all(!is.na(data$gender)))
normalize.gender <- function(x) {
is.male <- x %>% tolower %>% str_detect('^m')
ifelse(is.male, 'Male', 'Female') %>% factor
}
data <- data %>% mutate(gender=normalize.gender(gender))
# Convert integer fields to numeric for the sake of consistency
data <- data %>% mutate_each_(funs(as.numeric), c('Betacellulin', 'Eotaxin_3'))
head(data)
X <- data %>% select(-response)
y <- data[,'response']
table(y)
y
Impaired NotImpaired
91 242
names(X)
[1] "ACE_CD143_Angiotensin_Converti" "ACTH_Adrenocorticotropic_Hormon"
[3] "AXL" "Adiponectin"
[5] "Alpha_1_Antichymotrypsin" "Alpha_1_Antitrypsin"
[7] "Alpha_1_Microglobulin" "Alpha_2_Macroglobulin"
[9] "Angiopoietin_2_ANG_2" "Angiotensinogen"
[11] "Apolipoprotein_A_IV" "Apolipoprotein_A1"
[13] "Apolipoprotein_A2" "Apolipoprotein_B"
[15] "Apolipoprotein_CI" "Apolipoprotein_CIII"
[17] "Apolipoprotein_D" "Apolipoprotein_E"
[19] "Apolipoprotein_H" "B_Lymphocyte_Chemoattractant_BL"
[21] "BMP_6" "Beta_2_Microglobulin"
[23] "Betacellulin" "C_Reactive_Protein"
[25] "CD40" "CD5L"
[27] "Calbindin" "Calcitonin"
[29] "CgA" "Clusterin_Apo_J"
[31] "Complement_3" "Complement_Factor_H"
[33] "Connective_Tissue_Growth_Factor" "Cortisol"
[35] "Creatine_Kinase_MB" "Cystatin_C"
[37] "EGF_R" "EN_RAGE"
[39] "ENA_78" "Eotaxin_3"
[41] "FAS" "FSH_Follicle_Stimulation_Hormon"
[43] "Fas_Ligand" "Fatty_Acid_Binding_Protein"
[45] "Ferritin" "Fetuin_A"
[47] "Fibrinogen" "GRO_alpha"
[49] "Gamma_Interferon_induced_Monokin" "Glutathione_S_Transferase_alpha"
[51] "HB_EGF" "HCC_4"
[53] "Hepatocyte_Growth_Factor_HGF" "I_309"
[55] "ICAM_1" "IGF_BP_2"
[57] "IL_11" "IL_13"
[59] "IL_16" "IL_17E"
[61] "IL_1alpha" "IL_3"
[63] "IL_4" "IL_5"
[65] "IL_6" "IL_6_Receptor"
[67] "IL_7" "IL_8"
[69] "IP_10_Inducible_Protein_10" "IgA"
[71] "Insulin" "Kidney_Injury_Molecule_1_KIM_1"
[73] "LOX_1" "Leptin"
[75] "Lipoprotein_a" "MCP_1"
[77] "MCP_2" "MIF"
[79] "MIP_1alpha" "MIP_1beta"
[81] "MMP_2" "MMP_3"
[83] "MMP10" "MMP7"
[85] "Myoglobin" "NT_proBNP"
[87] "NrCAM" "Osteopontin"
[89] "PAI_1" "PAPP_A"
[91] "PLGF" "PYY"
[93] "Pancreatic_polypeptide" "Prolactin"
[95] "Prostatic_Acid_Phosphatase" "Protein_S"
[97] "Pulmonary_and_Activation_Regulat" "RANTES"
[99] "Resistin" "S100b"
[101] "SGOT" "SHBG"
[103] "SOD" "Serum_Amyloid_P"
[105] "Sortilin" "Stem_Cell_Factor"
[107] "TGF_alpha" "TIMP_1"
[109] "TNF_RII" "TRAIL_R3"
[111] "TTR_prealbumin" "Tamm_Horsfall_Protein_THP"
[113] "Thrombomodulin" "Thrombopoietin"
[115] "Thymus_Expressed_Chemokine_TECK" "Thyroid_Stimulating_Hormone"
[117] "Thyroxine_Binding_Globulin" "Tissue_Factor"
[119] "Transferrin" "Trefoil_Factor_3_TFF3"
[121] "VCAM_1" "VEGF"
[123] "Vitronectin" "von_Willebrand_Factor"
[125] "age" "tau"
[127] "p_tau" "Ab_42"
[129] "Genotype" "gender"
X %>% sapply(class) %>% table
.
factor numeric
2 128
#X %>% sapply(class) %>% .[. == 'factor']
names(X)[sapply(X, class) == 'factor']
[1] "Genotype" "gender"
We could start by looking at the relationship between some of the more intuitive variables like Age, Gender, and Genotype and impairment, to see if there are any obvious relationships.
Age vs Impairment
data %>%
mutate(age_range=cut(age, breaks=5)) %>%
group_by(age_range, response) %>% tally %>%
plot_ly(x=~age_range, y=~n, color=~response, type='bar') %>%
plotly::layout(hovermode='closest')
minimal value for n is 3, returning requested palette with 3 different levels
minimal value for n is 3, returning requested palette with 3 different levels
Genotype vs Impairment
data %>%
group_by(Genotype, response) %>% tally %>%
plot_ly(x=~Genotype, y=~n, color=~response, type='bar') %>%
plotly::layout(hovermode='closest')
minimal value for n is 3, returning requested palette with 3 different levels
minimal value for n is 3, returning requested palette with 3 different levels
Gender vs Impairment
data %>%
group_by(gender, response) %>% tally %>%
plot_ly(x=~gender, y=~n, color=~response, type='bar') %>%
plotly::layout(hovermode='closest')
minimal value for n is 3, returning requested palette with 3 different levels
minimal value for n is 3, returning requested palette with 3 different levels
{r} # data %>% plot_ly(x=~gender, y=~age, type='box') #Now what about the other ~125 variables? We can’t look at them all one at a time so perhaps there is a way to “condense” them into something more manageable:
library(corrplot)
d.pca <- X %>% select(-gender, -Genotype)
cor.mat <- corrplot(cor(d.pca), order='hclust', tl.col='black', tl.cex=.5)
# Extract the variable names from the figure below since they will be useful
# for creating similar plots for comparison (and it's easier to compare in the same order)
cor.var <- rownames(cor.mat)
#d.pca <- X %>% select(-gender, -Genotype)
pca <- prcomp(d.pca, scale=T)
corrplot(cor(d.pca[,cor.var], pca$x[,1:25]), tl.col='black', tl.cex=.5)
d.pca.pred <- as.data.frame(predict(pca, d.pca)[,1:2])
d.pca.pred$response <- y
pca.vars <- c('tau', 'Cortisol', 'age', 'Apolipoprotein_H')
d.pca.var <- pca$rotation[,2:3] %>%
as.data.frame %>% setNames(., c('PC1', 'PC2')) %>%
add_rownames(var='variable') %>%
filter(variable %in% pca.vars)
extend.num <- function(x) unlist(lapply(x, function(v) c(0, v, NA)))
extend.var <- function(x) unlist(lapply(x, function(v) c(NA, v, NA)))
d.pca.var <- data.frame(
variable=extend.var(d.pca.var$variable),
PC1=extend.num(d.pca.var$PC1),
PC2=extend.num(d.pca.var$PC2)
)
# Parameters for axis with no grid lines, ticks or labels
empty.axis <- list(
title = '',
zeroline = FALSE,
showline = FALSE,
showticklabels = FALSE,
ticklen = 0,
showgrid = FALSE
)
# Create a line plot of each variable showing which direction it moves within our 2D space
p1 <- plot_ly(
d.pca.var, x=~PC1, y=~PC2, text=~variable, type='scatter',
mode='lines+text', opacity=1, line=list(color='black'), textfont=list(color='white', size=14)
) %>% layout(
xaxis=list(range = c(-.5, .5), showgrid=F, zeroline=F),
yaxis=list(showgrid=F, zeroline=F)
)
# Create a heatmap of impairment incidence rate across our 2D space
d.hm <- d.pca.pred %>%
mutate(PC1=as.character(cut(PC1, breaks=3)), PC2=as.character(cut(PC2, breaks=3))) %>%
group_by(PC1, PC2) %>% summarise(PCT=100*sum(response == 'Impaired')/n()) %>%
acast(PC1 ~ PC2, value.var='PCT')
p2 <- plot_ly(z=d.hm, type='heatmap', reversescale=T) %>%
layout(xaxis=empty.axis, yaxis=empty.axis)
# Overlay the above plots on top of one another
subplot(p2, p1, margin=-1) %>%
layout(
paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)',
width=750, height=500,
title='2D Projection of Correlated Features Overlayed w/ Impairment Rates'
)
# library(caret)
# set.seed(1)
# m <- train(X, y, method='rf', trControl=trainControl(method='cv', number=10))
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).